# Read the 311 service-request export; the first row holds the column names.
df <- read.csv(
  "~/Desktop/CU 2016 Spr/DV CU/Final_proj/final_proj/311_15.csv",
  header = TRUE
)

# All requests ordered by closing date, most recently closed first.
date_df <- df[
  order(as.Date(df$Closed.Date, format = "%m/%d/%Y"), decreasing = TRUE),
]

# Requests whose status is still "Open".
df_open <- df[df$Status == "Open", ]

# Resolution time: closing date minus creation date (a difftime in days).
# as.character() guards against the date columns having been read as factors.
df$date_diff <- as.Date(as.character(df$Closed.Date), format = "%m/%d/%Y") -
  as.Date(as.character(df$Created.Date), format = "%m/%d/%Y")

# Homeless Encampment complaints only, restricted to the first 54 columns.
df_hl_diff <- df[df$Complaint.Type == "Homeless Encampment", 1:54]
# Fetch a Google terrain basemap that covers the coordinates in `df`.
#
# Args:
#   df: data frame with `Longitude` and `Latitude` columns.
#
# Returns:
#   The map object produced by get_map() for the bounding box that
#   make_bbox() computes from those coordinates.
get_bc_map <- function(df) {
  bc_bbox <- make_bbox(lon = df$Longitude, lat = df$Latitude)
  get_map(location = bc_bbox, source = "google", maptype = "terrain")
}
# Basemap covering every Homeless Encampment request.
bc_map <- get_bc_map(df_hl_diff)
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.710281,-73.983187&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
# Store the resolution time as a plain number instead of a difftime.
df_hl_diff$date_diff <- as.numeric(df_hl_diff$date_diff)

# One point per request, coloured by its resolution time treated as a
# category. The colour aesthetic refers to the bare column name so that it is
# evaluated inside the layer's data rather than pulled from the global
# data frame with `$`.
ggmap(bc_map) +
  geom_point(
    data = df_hl_diff,
    mapping = aes(x = Longitude, y = Latitude, color = factor(date_diff))
  ) +
  labs(
    title = "Homeless Encampment Issue Solving Time Distribution",
    x = "Longitude",
    y = "Latitude",
    colour = "Length of Time"
  )
# df_hl_diff is a data frame containing only Homeless Encampment (HE) complaints
# levels(df_hl_diff$Agency.Name) # "Internal Affairs Bureau", "New York City Police Department", "NYPD"
# levels(df_hl_diff$Borough)
# levels(df_hl_diff$Descriptor) # 46 levels
# levels(df_hl_diff$Location.Type) # 19 levels; remember to remove the empty "" level
# Modelling table: three categorical predictors plus the response
# (date_diff), without the rows whose Location.Type is blank.
df_pred <- df_hl_diff[
  ,
  c("Address.Type", "Borough", "Location.Type", "date_diff")
]
df_pred <- df_pred[df_pred$Location.Type != "", ]

# One-hot encode the categorical predictors.
df_dummies <- dummyVars(~ ., data = df_pred)
df_data <- as.data.frame(predict(df_dummies, newdata = df_pred)) # dummy df

# The first quarter of the rows is the test set, the rest the training set.
# The split is made by row position BEFORE incomplete rows are dropped:
# deriving the training start from nrow(test_data) after na.omit() would
# pull test rows into the training set whenever a test row was removed.
# The logical mask also behaves correctly when cut_off_index is 0.
cut_off_index <- nrow(df_data) %/% 4
is_test_row <- seq_len(nrow(df_data)) <= cut_off_index
test_data <- na.omit(df_data[is_test_row, , drop = FALSE])
train_data <- na.omit(df_data[!is_test_row, , drop = FALSE])

# Replace spaces and slashes in the dummy column names with dots so the
# names can be used in a model formula.
names(train_data) <- gsub("[ /]", ".", names(train_data))
names(test_data) <- gsub("[ /]", ".", names(test_data))
# Fit a classification forest: date_diff is converted to a factor, so each
# distinct resolution time is its own class; every other column in
# train_data is a predictor.
rf_clf <- randomForest(as.factor(date_diff) ~ ., data = train_data)
print(rf_clf)
##
## Call:
## randomForest(formula = as.factor(date_diff) ~ ., data = train_data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 12.26%
## Confusion matrix:
## 0 1 2 3 4 class.error
## 0 5196 0 0 0 0 0
## 1 712 0 0 0 0 1
## 2 5 0 0 0 0 1
## 3 8 0 0 0 0 1
## 4 1 0 0 0 0 1
# Show the variable-importance table stored on the fitted forest.
rf_clf[["importance"]]
## MeanDecreaseGini
## Address.Type. 0.25043847
## Address.Type.ADDRESS 0.87935544
## Address.Type.BLOCKFACE 0.72448637
## Address.Type.INTERSECTION 0.96615151
## Address.Type.LATLONG 0.00000000
## Address.Type.PLACENAME 0.52634256
## Borough.BRONX 3.34634234
## Borough.BROOKLYN 1.15454622
## Borough.MANHATTAN 4.40276785
## Borough.QUEENS 0.76353095
## Borough.STATEN.ISLAND 1.05288555
## Borough.Unspecified 0.00000000
## Location.Type. 0.00000000
## Location.Type.Bridge 0.16446193
## Location.Type.Club.Bar.Restaurant 0.00000000
## Location.Type.Commercial 0.00000000
## Location.Type.Ferry 0.00000000
## Location.Type.Highway 0.25948062
## Location.Type.House.and.Store 0.00000000
## Location.Type.House.of.Worship 0.00000000
## Location.Type.Park 0.00000000
## Location.Type.Park.Playground 0.96512999
## Location.Type.Parking.Lot 0.00000000
## Location.Type.Residential.Building 0.00000000
## Location.Type.Residential.Building.House 1.91229758
## Location.Type.Roadway.Tunnel 0.03390475
## Location.Type.Store.Commercial 1.41663714
## Location.Type.Street.Sidewalk 0.92909824
## Location.Type.Subway.Station 0.00000000
## Location.Type.Terminal 0.00000000
## Location.Type.Vacant.Lot 0.00000000
# Predicted resolution-time class for each row of the test set.
pred_y <- predict(rf_clf, newdata = test_data)
The OOB estimate of the error rate is 12.26%, which is an unbiased estimate of the error on a test set of the same size as the training set. Note, however, that the confusion matrix shows the forest assigns every case to class 0, so this error rate simply equals the share of requests that were not closed on the day they were created. From the importance levels, we can see that the factor ‘Borough’ has the biggest impact on the resolution time (Manhattan and the Bronx have the largest MeanDecreaseGini). ‘Address type’ and ‘Location type’ also have an impact on resolution time; in particular, the ‘ADDRESS’ type under ‘Address type’ and the ‘Residential.Building.House’ type under ‘Location type’ have high power for predicting our response variable (I also tested Agency.Name, which has 3 levels, and found that it has no impact).
# Recover the source rows behind the test-set predictions. Rows are matched
# by row name rather than by position 1:cut_off_index, because blank
# Location.Type rows and incomplete rows were dropped while building
# test_data, so positions in df_hl_diff no longer line up with pred_y.
# NOTE(review): assumes the row names of df_hl_diff are carried through the
# subsetting and dummy-encoding steps into test_data -- confirm.
test_rows <- rownames(test_data)
df_test_orig <- df_hl_diff[test_rows, ]

# Same rows with the observed date_diff replaced by the predicted class.
df_test_pred <- cbind(
  df_test_orig[setdiff(names(df_test_orig), "date_diff")],
  pred_y = pred_y
)

# Map of the observed resolution times in the test set.
bc_map_test_orig <- get_bc_map(df_test_orig)
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.70301,-73.981617&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
ggmap(bc_map_test_orig) +
  geom_point(
    data = df_test_orig,
    mapping = aes(x = Longitude, y = Latitude, color = factor(date_diff))
  ) +
  labs(
    title = "Homeless Encampment Issue Solving Time Distribution",
    x = "Longitude",
    y = "Latitude",
    colour = "Length of Time"
  )

# Map of the predicted resolution times for the same requests. The colour
# comes from pred_y; colouring by the observed date_diff here would simply
# redraw the previous map under a "Predicted" title.
bc_map_test_pred <- get_bc_map(df_test_pred)
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.70301,-73.981617&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
ggmap(bc_map_test_pred) +
  geom_point(
    data = df_test_pred,
    mapping = aes(x = Longitude, y = Latitude, color = factor(pred_y))
  ) +
  labs(
    title = "Predicted Homeless Encampment Issue Solving Time Distribution",
    x = "Longitude",
    y = "Latitude",
    colour = "Length of Time"
  )